# Data Manipulation
import numpy as np
import pandas as pd
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
import matplotlib.style as style
style.use('fivethirtyeight')
#statistics
from scipy import stats
from scipy.stats import shapiro
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from statsmodels.stats.outliers_influence import variance_inflation_factor
#set maximum columns and rows
pd.set_option('display.max_columns',1000)
# Feature Selection and Encoding
from sklearn.feature_selection import RFE, RFECV
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, LabelEncoder,StandardScaler
from sklearn.preprocessing import RobustScaler
# Machine learning
from sklearn import model_selection,preprocessing, metrics, linear_model
from sklearn.model_selection import StratifiedKFold , KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso,RidgeCV,LassoCV,ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,BaggingRegressor,AdaBoostRegressor
from xgboost.sklearn import XGBRegressor
import lightgbm as lgb
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
from sklearn.pipeline import make_pipeline
# Grid and Random Search
import scipy.stats as st
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# Metrics
from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc,r2_score,mean_squared_error
# Managing Warnings
import warnings
warnings.filterwarnings('ignore')
# Plot the Figures Inline
%matplotlib inline
#plotly
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.express as px
from plotly.subplots import make_subplots
# Load the training and test sets from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
def info(df):
    """Print the shape of *df*, chart its missing values and return a summary.

    The returned DataFrame has one row per column of *df* with the fields
    ``Name``, ``dtypes``, ``Missing`` (count of nulls), ``Total`` (row count),
    ``Missing Percentage`` (rounded to two decimals) and ``Uniques``.

    A bar chart of the columns that contain missing values (sorted by
    percentage, with a reference line at 50) is shown only when at least one
    such column exists; an empty chart is never drawn.
    """
    print(f"Dataset Shape: {df.shape}")

    # Build the per-column table directly instead of reshaping ``df.dtypes``.
    summary = pd.DataFrame({'Name': df.columns, 'dtypes': df.dtypes.values})
    summary['Missing'] = df.isnull().sum().values
    summary['Total'] = len(df)
    summary['Missing Percentage'] = np.round(
        (summary['Missing'] / summary['Total']) * 100, 2
    )
    summary['Uniques'] = df.nunique().values

    # Only columns that actually have gaps are worth plotting.
    mask = summary[summary['Missing Percentage'] > 0].sort_values(
        by=['Missing Percentage'], ascending=False
    )
    if not mask.empty:
        plt.figure(figsize=(7, 7))
        sns.barplot(x=mask['Name'], y=mask['Missing Percentage'])
        plt.xticks(rotation=90)
        plt.axhline(50)  # reference line at 50% missing
        plt.show()
    return summary
# Missing-value overview for both datasets.
info(train)
info(test)

# Interactive histogram of the target with a box plot in the margin.
fig = px.histogram(
    train,
    x="SalePrice",
    marginal="box",  # or violin, rug
    hover_data=train,
)
fig.update_layout(title_text='SalePrice')
fig.show()

# Probability plot of the raw target.
fig, ax = plt.subplots(constrained_layout=True, figsize=(10, 7))
stats.probplot(train['SalePrice'], plot=ax)
plt.show()

# Shapiro-Wilk test on the raw target; only the p-value is reported.
stat, pvalue = stats.shapiro(train['SalePrice'])
print('p value', pvalue)
# Here the p-value is below alpha (0.05), so we reject the null hypothesis: SalePrice is not normally distributed.
# Report the sample skewness of the target column.
saleprice_skew = train['SalePrice'].skew()
print('Skewness:', saleprice_skew)
# The skewness is positive, so the target column (SalePrice) is right-skewed.
# Replace the target with log(1 + SalePrice), then redraw the probability plot.
log_sale_price = np.log1p(train['SalePrice'])
train['SalePrice'] = log_sale_price

fig, ax = plt.subplots(figsize=(10, 7), constrained_layout=True)
stats.probplot(train['SalePrice'], plot=ax)
plt.show()
# Interactive histogram of GrLivArea with a box plot in the margin.
fig = px.histogram(
    train,
    x="GrLivArea",
    marginal="box",  # or violin, rug
    hover_data=train,
)
fig.update_layout(title_text='GrLivArea')
fig.show()
# Deleting outliers: keep only rows with GrLivArea below 4500 and renumber
# the remaining rows from zero.
keep_rows = train['GrLivArea'] < 4500
train = train.loc[keep_rows].reset_index(drop=True)
train.shape
# Stack train and test so that every preprocessing step is applied to both.
# ignore_index=True gives the combined frame unique row labels; without it the
# labels of train and test overlap, and label-aligned operations on the
# combined frame become ambiguous.
df = pd.concat([train, test], axis=0, ignore_index=True)
df.shape

# The target and the row identifier are not model features.
df = df.drop(columns=['SalePrice', 'Id'])
info(df)
# Store these numerically coded columns as strings so that later steps treat
# them as object (categorical) columns rather than numbers.
for col in ('MSSubClass', 'YrSold', 'MoSold'):
    df[col] = df[col].astype(str)

# Columns filled with a fixed category.
for col, fill_value in (('Functional', 'Typ'),
                        ('Electrical', 'SBrkr'),
                        ('KitchenQual', 'TA')):
    df[col] = df[col].fillna(fill_value)

# Columns filled with their most frequent value.
for col in ('Exterior1st', 'Exterior2nd', 'SaleType'):
    df[col] = df[col].fillna(df[col].mode()[0])

# Garage measurements: missing values become 0.
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    df[col] = df[col].fillna(0)

# Garage and basement descriptors: missing values become the 'None' category.
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
            'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
            'BsmtFinType2'):
    df[col] = df[col].fillna('None')

# MSZoning: fill with the most frequent zoning within the same MSSubClass.
df['MSZoning'] = df.groupby('MSSubClass')['MSZoning'].transform(
    lambda x: x.fillna(x.mode()[0]))

# Every remaining gap in an object column becomes 'None'. Assigning the filled
# columns back directly avoids the label-aligned DataFrame.update round trip.
objects = [col for col in df.columns if df[col].dtype == object]
df[objects] = df[objects].fillna('None')

# LotFrontage: fill with the median of the same Neighborhood. This must run
# before the blanket zero-fill below.
df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(
    lambda x: x.fillna(x.median()))

# Every remaining gap in a numeric column becomes 0.
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics = [col for col in df.columns if df[col].dtype in numeric_dtypes]
df[numerics] = df[numerics].fillna(0)
# Box-Cox transform (on 1 + x) every numeric column whose skewness exceeds 0.5.
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics2 = [col for col in df.columns if df[col].dtype in numeric_dtypes]

skew_features = df[numerics2].skew()
high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index

for skewed_col in skew_index:
    # Lambda is estimated on the shifted column (values + 1).
    best_lambda = boxcox_normmax(df[skewed_col] + 1)
    df[skewed_col] = boxcox1p(df[skewed_col], best_lambda)
# Drop three columns that are not used as features.
df = df.drop(columns=['Utilities', 'Street', 'PoolQC'])

# Combined features built from the existing columns.
df['YrBltAndRemod'] = df['YearBuilt'] + df['YearRemodAdd']
df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
df['Total_sqr_footage'] = (
    df['BsmtFinSF1'] + df['BsmtFinSF2'] + df['1stFlrSF'] + df['2ndFlrSF']
)
df['Total_Bathrooms'] = (
    df['FullBath'] + (0.5 * df['HalfBath'])
    + df['BsmtFullBath'] + (0.5 * df['BsmtHalfBath'])
)
df['Total_porch_sf'] = (
    df['OpenPorchSF'] + df['3SsnPorch'] + df['EnclosedPorch']
    + df['ScreenPorch'] + df['WoodDeckSF']
)

# 0/1 indicator columns: 1 when the source column is greater than zero.
for flag_col, source_col in (
    ('haspool', 'PoolArea'),
    ('has2ndfloor', '2ndFlrSF'),
    ('hasgarage', 'GarageArea'),
    ('hasbsmt', 'TotalBsmtSF'),
    ('hasfireplace', 'Fireplaces'),
):
    df[flag_col] = df[source_col].gt(0).astype('int64')
# One-hot encode the categorical columns, printing the shape before and after.
print(df.shape)
df = pd.get_dummies(df)
print(df.shape)
train.shape

# The first len(train) rows of the combined frame are the training rows and
# the rest are the test rows. Deriving the split point from `train` keeps it
# correct if the outlier filter ever removes a different number of rows.
n_train = len(train)
Train = df.iloc[:n_train, :].copy()  # explicit copy: a column is added below
Test = df.iloc[n_train:, :]

# Re-attach the target row by row; `train` and `Train` are in the same order.
Train['SalePrice'] = train['SalePrice'].to_numpy()
X = Train.drop('SalePrice', axis=1)
y = Train['SalePrice']
# Remove hand-picked rows (given by position) from both features and target.
outliers = [30, 88, 462, 631, 1322]
X = X.drop(X.index[outliers])
y = y.drop(y.index[outliers])

# Columns whose most frequent value covers more than 99.94% of the rows.
overfit = [
    col for col in X.columns
    if X[col].value_counts().iloc[0] / len(X) * 100 > 99.94
]
overfit.append('MSZoning_C (all)')
overfit

# Drop those columns from the training features and from the test set.
X = X.drop(overfit, axis=1)
Test = Test.drop(overfit, axis=1)
print('X', X.shape, 'y', y.shape, 'Test', Test.shape)
#CREATE PIPELINE
# Candidate regularisation values searched by the cross-validated linear models.
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

# Every model is wrapped in a pipeline that applies RobustScaler first.
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt))

# max_iter is passed as an integer (int(1e7), not the float 1e7) because
# scikit-learn expects an integer iteration count.
lasso = make_pipeline(
    RobustScaler(),
    LassoCV(max_iter=int(1e7), alphas=alphas2, random_state=42),
)

elasticnet = make_pipeline(
    RobustScaler(),
    ElasticNetCV(max_iter=int(1e7), alphas=e_alphas,
                 random_state=42, l1_ratio=e_l1ratio),
)

svr = make_pipeline(RobustScaler(), SVR(C=20, epsilon=0.008, gamma=0.0003))

gbr = make_pipeline(
    RobustScaler(),
    GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                              max_depth=4, max_features='sqrt',
                              min_samples_leaf=15, min_samples_split=10,
                              loss='huber', random_state=42),
)

lightgbm = make_pipeline(
    RobustScaler(),
    lgb.LGBMRegressor(objective='regression',
                      num_leaves=4,
                      learning_rate=0.01,
                      n_estimators=5000,
                      max_bin=200,
                      bagging_fraction=0.75,
                      bagging_freq=5,
                      bagging_seed=7,
                      feature_fraction=0.2,
                      feature_fraction_seed=7,
                      verbose=-1,
                      #min_data_in_leaf=2,
                      #min_sum_hessian_in_leaf=11
                      ),
)

# 'reg:squarederror' is the current name of the squared-error objective that
# was formerly spelled 'reg:linear' (deprecated).
# NOTE(review): both `seed` and `random_state` are passed, as in the original;
# confirm which one the installed xgboost version honours.
xgboost = make_pipeline(
    RobustScaler(),
    XGBRegressor(learning_rate=0.01, n_estimators=3460,
                 max_depth=3, min_child_weight=0,
                 gamma=0, subsample=0.7,
                 colsample_bytree=0.7,
                 objective='reg:squarederror', nthread=-1,
                 scale_pos_weight=1, seed=27,
                 reg_alpha=0.00006, random_state=42),
)

# stacking
# Six base pipelines feed a meta-regressor (the xgboost pipeline);
# use_features_in_secondary=True is passed so the original features are also
# made available to the meta-regressor.
stack_gen = StackingCVRegressor(
    regressors=(ridge, lasso, elasticnet, gbr, xgboost, lightgbm),
    meta_regressor=xgboost,
    use_features_in_secondary=True,
)
# Pipelines to evaluate with 10-fold cross-validation.
pipelines = [
    ('ScaledLASSO', lasso),
    ('Scaledelasticnet', elasticnet),
    ('Scaledsvr', svr),
    ('Scaledgbr', gbr),
    ('Scaledlightgbm', lightgbm),
]
#pipelines.append(('Scaledstack_gen', stack_gen))

# Accumulators are created once, before the loop, so that they hold one entry
# per model when the loop finishes.
results = []
names = []

# random_state only takes effect when shuffle=True; the same seeded splitter
# is used for every model so that they are compared on identical folds.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=1)

for name, model in pipelines:
    cv_results = model_selection.cross_val_score(
        model, X, y, cv=kfold, scoring='neg_mean_squared_error')

    # The scorer returns negated MSE; flip the sign before taking the root.
    mse_scores = -cv_results
    rmse_scores = np.sqrt(mse_scores)

    print(name)
    print(rmse_scores)
    print('BIAS ERROR:', rmse_scores.mean())
    print('VARIANCE ERROR:', np.var(rmse_scores, ddof=1))

    results.append(rmse_scores)
    names.append(name)
from datetime import datetime

print('START Fit')

# The stacked ensemble is fitted on plain arrays instead of DataFrames.
print(datetime.now(), 'StackingCVRegressor')
stack_gen_model = stack_gen.fit(np.array(X), np.array(y))

# Fit each pipeline on the full training data, printing a timestamp first.
full_data_models = {}
for fit_label, estimator in (
    ('elasticnet', elasticnet),
    ('lasso', lasso),
    ('ridge', ridge),
    ('svr', svr),
    ('GradientBoosting', gbr),
    ('xgboost', xgboost),
    ('lightgbm', lightgbm),
):
    print(datetime.now(), fit_label)
    full_data_models[fit_label] = estimator.fit(X, y)

# Bind the fitted models to the names used by the blending step.
elastic_model_full_data = full_data_models['elasticnet']
lasso_model_full_data = full_data_models['lasso']
ridge_model_full_data = full_data_models['ridge']
svr_model_full_data = full_data_models['svr']
gbr_model_full_data = full_data_models['GradientBoosting']
xgb_model_full_data = full_data_models['xgboost']
lgb_model_full_data = full_data_models['lightgbm']
# FUNCTION TO CALCULATE RMSE
def rmse(y, y_pred):
    """Return the square root of the mean squared error of *y_pred* against *y*."""
    squared_error = mean_squared_error(y, y_pred)
    return np.sqrt(squared_error)
def blend_models_predict(X):
    """Return the weighted blend of all fitted models' predictions for *X*.

    The seven individual pipelines contribute with the weights listed below
    and the stacked model with weight 0.3. The stacked model is given *X*
    converted to a NumPy array.
    """
    weighted_models = (
        (0.1, elastic_model_full_data),
        (0.05, lasso_model_full_data),
        (0.1, ridge_model_full_data),
        (0.1, svr_model_full_data),
        (0.1, gbr_model_full_data),
        (0.15, xgb_model_full_data),
        (0.1, lgb_model_full_data),
    )

    # Accumulate the terms in the listed order, adding the stacked model last.
    blended = None
    for weight, fitted_model in weighted_models:
        contribution = weight * fitted_model.predict(X)
        blended = contribution if blended is None else blended + contribution

    return blended + (0.3 * stack_gen_model.predict(np.array(X)))
# Blend quality on the training data.
print('RMSE score on train data:')
train_rmse = rmse(y, blend_models_predict(X))
print(train_rmse)

print('Predict submission')
y_pred = blend_models_predict(Test)
# The target was log1p-transformed earlier, so expm1 converts the predictions
# back to the original scale.
y_pred1 = np.expm1(y_pred)
y_pred1

# Attach the predictions to the test set and show the two submission columns.
test['SalePrice'] = y_pred1
a = test[['Id', 'SalePrice']]
print(a)
#a.to_csv('Submission17.csv',index=False)